import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Load the crime dataset. Fields are split on a comma with optional
# surrounding whitespace (a regex separator, hence engine='python'), and "?"
# entries are read as missing values.
crime = pd.read_csv(
    'formatted.csv',
    sep=r'\s*,\s*',  # raw string: '\s' is an invalid escape in a plain literal
    encoding='latin-1',
    engine='python',
    na_values=["?"],
)
# First look at the data. Outside a notebook a bare expression displays
# nothing, so the preview and the column index are printed explicitly.
crime.info()
print(crime.head())
print(crime.columns)
# Check out min, max, mean, median, and std for violent crimes
# Summary statistics of the violent-crime rate, printed in a fixed order.
violent_rate = crime['ViolentCrimesPerPop']
violent_stats = (
    ('Minimum', violent_rate.min),
    ('Maximum', violent_rate.max),
    ('Mean', violent_rate.mean),
    ('STD', violent_rate.std),
    ('Median', violent_rate.median),
)
for stat_label, compute_stat in violent_stats:
    print(stat_label + ' for violent crimes: ', compute_stat())
# Check out min, max, mean, median, and std for non-violent crimes
# Summary statistics of the non-violent-crime rate, printed in a fixed order.
nonviolent_rate = crime['nonViolPerPop']
nonviolent_stats = (
    ('Minimum', nonviolent_rate.min),
    ('Maximum', nonviolent_rate.max),
    ('Mean', nonviolent_rate.mean),
    ('STD', nonviolent_rate.std),
    ('Median', nonviolent_rate.median),
)
for stat_label, compute_stat in nonviolent_stats:
    print(stat_label + ' for non-violent crimes: ', compute_stat())
# Violent crime density
# One histogram per violent-crime rate on a 3x2 grid (the sixth cell stays
# empty). Each entry is (column, panel title, x-axis label).
violent_panels = [
    ('ViolentCrimesPerPop', 'Violent Crimes density',
     'Number of Violent Crimes per 100k population'),
    ('murdPerPop', 'Murder Crimes density',
     'Number of Murders per 100k population'),
    ('rapesPerPop', 'Rape Crimes density',
     'Number of Rapes per 100k population'),
    ('robbbPerPop', 'Robbery Crimes density',
     'Number of Robberies per 100K population'),
    ('assaultPerPop', 'Assault Crimes density',
     'Number of Assaults per 100k population'),
]
plt.figure(figsize=(20, 15))
for slot, (column, panel_title, x_label) in enumerate(violent_panels, start=1):
    plt.subplot(3, 2, slot)
    plt.title(panel_title)
    crime[column].hist()
    plt.xlabel(x_label)
    plt.ylabel('count')
# Non-violent crime density
# One histogram per non-violent-crime rate on a 3x2 grid (the sixth cell
# stays empty). Each entry is (column, panel title, x-axis label, y-axis label).
nonviolent_panels = [
    ('nonViolPerPop', 'Non-Violent Crimes density',
     'Number of Non-Violent Crimes per 100k population', 'count'),
    ('arsonsPerPop', 'Arson Crimes density', 'Arsons density', 'count'),
    ('burglPerPop', 'Burglary Crimes density', 'Burglaries density', 'count'),
    ('larcPerPop', 'Larceny Crimes density', 'Larcenies density', 'Counts'),
    ('autoTheftPerPop', 'Auto-theft Crimes density', 'Auto thefts density',
     'Counts'),
]
plt.figure(figsize=(20, 15))
for slot, (column, panel_title, x_label, y_label) in enumerate(
        nonviolent_panels, start=1):
    plt.subplot(3, 2, slot)
    plt.title(panel_title)
    crime[column].hist()
    plt.xlabel(x_label)
    plt.ylabel(y_label)
# This is a smart way of visualizing the density of each type of crime and seeing which one is most widespread.
# Check per capita income for each race
# One histogram of per-capita income per group on a 3x2 grid (the sixth cell
# stays empty). The panel title is the column name itself.
# Each entry is (column, group name used in the x-axis label).
income_panels = [
    ('whitePerCap', 'caucasians'),
    ('blackPerCap', 'blacks'),
    ('indianPerCap', 'indians'),
    ('AsianPerCap', 'asians'),
    ('HispPerCap', 'hispanics'),
]
plt.figure(figsize=(20, 15))
for slot, (column, group_name) in enumerate(income_panels, start=1):
    plt.subplot(3, 2, slot)
    plt.title(column)
    crime[column].hist()
    plt.xlabel('Per capita income for ' + group_name)
    plt.ylabel('count')
# Age vs. crime rates.
#
# For each target (non-violent, then violent crime rate) and each predictor
# family (age percentage, then age percentage * population) this draws one
# regression plot per age group and prints the Pearson correlation
# coefficients, in the order:
#   1. % age          vs. non-violent crimes
#   2. population age vs. non-violent crimes
#   3. % age          vs. violent crimes
#   4. population age vs. violent crimes

# (column suffix, label used in the printed report)
age_groups = [
    ('12t21', 'ages 12-21'),
    ('12t29', 'ages 12-29'),
    ('16t24', 'ages 16-24'),
    ('65up', 'ages 65+'),
]
age_pct_cols = ['agePct' + suffix for suffix, _ in age_groups]
age_pop_cols = ['agePop' + suffix for suffix, _ in age_groups]
age_labels = [label for _, label in age_groups]

# The derived columns must exist BEFORE crime1 is built: dropna(subset=...)
# raises KeyError for a column name that is not in the frame yet.
# NOTE(review): if the agePct* columns are percentages (0-100), the product
# is 100x a head count; Pearson r is unaffected by that scale - confirm units.
for pct_col, pop_col in zip(age_pct_cols, age_pop_cols):
    crime[pop_col] = crime[pct_col] * crime['population']

# Correlations are computed on crime1: the rows with no missing value in any
# of the predictor or target columns of this section.
crime1 = crime.dropna(
    subset=age_pct_cols + age_pop_cols
    + ['nonViolPerPop', 'ViolentCrimesPerPop'],
    how='any')

for target in ('nonViolPerPop', 'ViolentCrimesPerPop'):
    for predictor_cols in (age_pct_cols, age_pop_cols):
        # Regression plots use the full frame.
        for col in predictor_cols:
            sns.lmplot(x=col, y=target, data=crime)
        # Correlation coefficients use the NaN-free copy.
        for col, label in zip(predictor_cols, age_labels):
            print('Correlation coefficient for {}: '.format(label),
                  scipy.stats.pearsonr(crime1[col], crime1[target])[0])
# Education vs. crime rates.
#
# For each target (non-violent, then violent crime rate) and each predictor
# family (education percentage, then percentage * population) this draws one
# regression plot per education level and prints the Pearson correlation
# coefficients, in the order:
#   1. % education          vs. non-violent crimes
#   2. population education vs. non-violent crimes
#   3. % education          vs. violent crimes
#   4. population education vs. violent crimes
edu_pct_cols = ['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore']
edu_pop_cols = [name.replace('Pct', 'Pop', 1) for name in edu_pct_cols]

# The derived columns must exist BEFORE crime1 is built: dropna(subset=...)
# raises KeyError for a column name that is not in the frame yet.
# NOTE(review): percentage * population is presumably 100x a head count;
# Pearson r is unaffected by that scale - confirm units.
for pct_col, pop_col in zip(edu_pct_cols, edu_pop_cols):
    crime[pop_col] = crime[pct_col] * crime['population']

# crime1 keeps only rows with no missing value in this section's columns or
# in the age columns of the previous section.
crime1 = crime.dropna(
    subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up',
            'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up']
    + edu_pct_cols + edu_pop_cols
    + ['nonViolPerPop', 'ViolentCrimesPerPop'],
    how='any')

for target in ('nonViolPerPop', 'ViolentCrimesPerPop'):
    for predictor_cols in (edu_pct_cols, edu_pop_cols):
        # Regression plots use the full frame.
        for col in predictor_cols:
            sns.lmplot(x=col, y=target, data=crime)
        # Correlation coefficients use the NaN-free copy.
        for col in predictor_cols:
            print('Correlation coefficient for {}: '.format(col),
                  scipy.stats.pearsonr(crime1[col], crime1[target])[0])
# Employment vs. crime rates.
#
# For each target (non-violent, then violent crime rate) and each predictor
# family (employment percentage, then percentage * population) this draws the
# regression plots and prints the Pearson correlation coefficients, in the
# order:
#   1. % employed/unemployed          vs. non-violent crimes
#   2. population employed/unemployed vs. non-violent crimes
#   3. % employed/unemployed          vs. violent crimes
#   4. population employed/unemployed vs. violent crimes
emp_pct_cols = ['PctEmploy', 'PctUnemployed']
emp_pop_cols = [name.replace('Pct', 'Pop', 1) for name in emp_pct_cols]

# The derived columns must exist BEFORE crime1 is built: dropna(subset=...)
# raises KeyError for a column name that is not in the frame yet.
# NOTE(review): percentage * population is presumably 100x a head count;
# Pearson r is unaffected by that scale - confirm units.
for pct_col, pop_col in zip(emp_pct_cols, emp_pop_cols):
    crime[pop_col] = crime[pct_col] * crime['population']

# crime1 keeps only rows with no missing value in this section's columns or
# in the age / education columns of the previous sections.
crime1 = crime.dropna(
    subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up',
            'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
            'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
            'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore']
    + emp_pct_cols + emp_pop_cols
    + ['nonViolPerPop', 'ViolentCrimesPerPop'],
    how='any')

for target in ('nonViolPerPop', 'ViolentCrimesPerPop'):
    for predictor_cols in (emp_pct_cols, emp_pop_cols):
        # Regression plots use the full frame (employed first).
        for col in predictor_cols:
            sns.lmplot(x=col, y=target, data=crime)
        # The report lists the unemployed figure first, then the employed one.
        for col in reversed(predictor_cols):
            print('Correlation coefficient for {}: '.format(col),
                  scipy.stats.pearsonr(crime1[col], crime1[target])[0])
# Housing occupancy / vacancy vs. crime rates.
#
# For each target (non-violent, then violent crime rate) and each predictor
# family (housing percentage, then percentage * population) this draws one
# regression plot per housing measure and prints the Pearson correlation
# coefficients, in the order:
#   1. % vacancy          vs. non-violent crimes
#   2. population vacancy vs. non-violent crimes
#   3. % vacancy          vs. violent crimes
#   4. population vacancy vs. violent crimes
housing_pct_cols = ['PctHousOccup', 'PctHousOwnOcc',
                    'PctVacantBoarded', 'PctVacMore6Mos']
housing_pop_cols = [name.replace('Pct', 'Pop', 1) for name in housing_pct_cols]

# The derived columns must exist BEFORE crime1 is built: dropna(subset=...)
# raises KeyError for a column name that is not in the frame yet.
# NOTE(review): percentage * population is presumably 100x a count;
# Pearson r is unaffected by that scale - confirm units.
for pct_col, pop_col in zip(housing_pct_cols, housing_pop_cols):
    crime[pop_col] = crime[pct_col] * crime['population']

# crime1 keeps only rows with no missing value in this section's columns or
# in the age / education / employment columns of the previous sections.
crime1 = crime.dropna(
    subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up',
            'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
            'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
            'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
            'PctUnemployed', 'PctEmploy',
            'PopUnemployed', 'PopEmploy']
    + housing_pct_cols + housing_pop_cols
    + ['nonViolPerPop', 'ViolentCrimesPerPop'],
    how='any')

for target in ('nonViolPerPop', 'ViolentCrimesPerPop'):
    for predictor_cols in (housing_pct_cols, housing_pop_cols):
        # Regression plots use the full frame.
        for col in predictor_cols:
            sns.lmplot(x=col, y=target, data=crime)
        # Correlation coefficients use the NaN-free copy.
        for col in predictor_cols:
            print('Correlation coefficient for {}: '.format(col),
                  scipy.stats.pearsonr(crime1[col], crime1[target])[0])
# Race vs. crime rates.
#
# For each target (non-violent, then violent crime rate) and each predictor
# family (race percentage, then percentage * population) this draws one
# regression plot per group and prints the Pearson correlation coefficients,
# in the order:
#   1. % race          vs. non-violent crimes
#   2. population race vs. non-violent crimes
#   3. % race          vs. violent crimes
#   4. population race vs. violent crimes

# (column name, label used in the printed report) - the report labels differ
# from the column names, so both are listed explicitly.
race_pct = [
    ('racepctblack', 'pctraceblack'),
    ('racePctWhite', 'pctRaceWhite'),
    ('racePctAsian', 'pctRaceAsian'),
    ('racePctHisp', 'pctRaceHisp'),
]
race_pop = [
    ('racepopblack', 'popraceblack'),
    ('racePopWhite', 'popRaceWhite'),
    ('racePopAsian', 'popRaceAsian'),
    ('racePopHisp', 'popRaceHisp'),
]

# The derived columns must exist BEFORE crime1 is built: dropna(subset=...)
# raises KeyError for a column name that is not in the frame yet.
# NOTE(review): percentage * population is presumably 100x a head count;
# Pearson r is unaffected by that scale - confirm units.
for (pct_col, _), (pop_col, _) in zip(race_pct, race_pop):
    crime[pop_col] = crime[pct_col] * crime['population']

# crime1 keeps only rows with no missing value in this section's columns or
# in any column analysed in the previous sections. It is also the frame the
# regression models further down are trained on.
crime1 = crime.dropna(
    subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up',
            'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
            'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
            'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
            'PctUnemployed', 'PctEmploy',
            'PopUnemployed', 'PopEmploy',
            'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
            'PopHousOccup', 'PopHousOwnOcc', 'PopVacantBoarded', 'PopVacMore6Mos']
    + [col for col, _ in race_pct]
    + [col for col, _ in race_pop]
    + ['nonViolPerPop', 'ViolentCrimesPerPop'],
    how='any')

for target in ('nonViolPerPop', 'ViolentCrimesPerPop'):
    for predictors in (race_pct, race_pop):
        # Regression plots use the full frame.
        for col, _ in predictors:
            sns.lmplot(x=col, y=target, data=crime)
        # Correlation coefficients use the NaN-free copy.
        for col, label in predictors:
            print('Correlation coefficient for {}: '.format(label),
                  scipy.stats.pearsonr(crime1[col], crime1[target])[0])
# Plot linear regression plots for violent crimes vs. non-violent crimes by region
# Violent vs. non-violent crime rate, coloured by region, with a little
# jitter on both axes.
sns.lmplot(
    x='ViolentCrimesPerPop',
    y='nonViolPerPop',
    data=crime,
    fit_reg=True,  # regression line
    hue='Region',  # color by Region
    x_jitter=.1,
    y_jitter=0.1,
)
# Pairwise correlations of the numeric columns. The frame appears to hold
# non-numeric columns too ('Region' is used as a hue above), and
# DataFrame.corr() no longer skips those silently on recent pandas, so the
# numeric columns are selected explicitly. The result is printed because a
# bare expression displays nothing outside a notebook.
print(crime.select_dtypes(include='number').corr())
# A lot of variables are present. We will tidy this up by subsetting columns into a new dataframe.
# Keep only the education, employment, housing and race percentages plus the
# two crime-rate targets. The age groups are left out of this subset, since
# they did not have a strong correlation.
crimedata_cols = [
    'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
    'PctUnemployed', 'PctEmploy',
    'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
    'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
    'nonViolPerPop', 'ViolentCrimesPerPop',
]
crimedata = crime[crimedata_cols]
# Print the correlation matrix: a bare expression displays nothing outside a
# notebook.
print(crimedata.corr())
# Looks much better.
# Correlation matrix of the selected columns, kept for the heatmap below.
crimedata_corr = crimedata.corr()

# Annotated heatmap on a 12x10 figure, with the colour scale starting at -1.
plt.figure(figsize=(12, 10))
sns.heatmap(data=crimedata_corr, vmin=-1.0, annot=True)

# Write the current figure to disk.
plt.savefig('CrimeHeatmap.png')
# This is much more concise and takes less time to generate. Correlation coefficients can easily be viewed for any pair of variables.
# Scatter-plot matrix of every pair of columns in the reduced frame.
sns.pairplot(data=crimedata)
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing

# Predict the violent-crime rate from the socio-economic percentages plus the
# non-violent-crime rate, using crime1 (the frame with missing rows dropped).
X = crime1[['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
            'PctUnemployed', 'PctEmploy',
            'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
            'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
            'nonViolPerPop']]
y = crime1['ViolentCrimesPerPop']

# Hold out 25% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123)

# check size
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Ordinary least squares. The former `normalize=True` argument is dropped:
# scikit-learn removed it from its linear models, and for an unpenalised
# least-squares fit rescaling the predictors does not change the predictions
# or the coefficients reported in the original units.
lm = linear_model.LinearRegression()

# train the model on the TRAINING split
model = lm.fit(X_train, y_train)

# score on the training split
print(model.score(X_train, y_train))

# predictions for the held-out rows
y_pred = lm.predict(X_test)

# fitted coefficients (one per predictor) and intercept
coefficients = model.coef_
intercepts = model.intercept_

# observed vs. predicted values on the test split
plt.scatter(y_test, y_pred)
# Fit the same regression with statsmodels to get the full summary table.
# sm.OLS does not add an intercept by itself, so the constant column is added
# to the design matrix that is actually fitted (the training predictors).
# X itself is left untouched so later steps that reuse X do not pick up an
# extra constant column.
model2 = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(model2.summary())

# show values (coefficients[0] is the coefficient of the first predictor only)
print("The coeffcient of our model is: ", coefficients[0])
print("The intercept for our model is: ", intercepts)
print("Linear model Train dataset score is: ", model.score(X_train, y_train))
print("Linear model Test dataset score is: ", model.score(X_test, y_test))
# Compute cross-validation score
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model on the full X / y; prints the
# per-fold scores followed by their mean.
cv_results = cross_val_score(estimator=lm, X=X, y=y, cv=10)
print(cv_results)
mean_cv_score = cv_results.mean()
print('Accuracy of model: ', mean_cv_score)
# Ridge regression
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Ridge(normalize=True)` is no longer accepted by current scikit-learn.
# The replacement given in scikit-learn's deprecation notice is to scale the
# predictors with StandardScaler(with_mean=False) and to multiply the
# original alpha by the number of training samples.
ridge_alpha = 0.1
ridge = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=ridge_alpha * X_train.shape[0]),
)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
# Printed explicitly: a bare expression displays nothing outside a notebook.
print('Ridge model Test dataset score is: ', ridge.score(X_test, y_test))
# Lasso regression
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Lasso(normalize=True)` is no longer accepted by current scikit-learn.
# The replacement given in scikit-learn's deprecation notice is to scale the
# predictors with StandardScaler(with_mean=False) and to multiply the
# original alpha by sqrt(number of training samples).
lasso_alpha = 0.1
lasso = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=lasso_alpha * np.sqrt(X_train.shape[0])),
)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
# Printed explicitly: a bare expression displays nothing outside a notebook.
print('Lasso model Test dataset score is: ', lasso.score(X_test, y_test))
# Predict the non-violent-crime rate from the socio-economic percentages plus
# the violent-crime rate, using crime1 (the frame with missing rows dropped).
X = crime1[['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
            'PctUnemployed', 'PctEmploy',
            'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
            'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
            'ViolentCrimesPerPop']]
y = crime1['nonViolPerPop']

# Hold out 25% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123)

# check size
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Ordinary least squares. The former `normalize=True` argument is dropped:
# scikit-learn removed it from its linear models, and for an unpenalised
# least-squares fit rescaling the predictors does not change the predictions
# or the coefficients reported in the original units.
lm = linear_model.LinearRegression()

# train the model on the TRAINING split
model = lm.fit(X_train, y_train)

# score on the training split
print(model.score(X_train, y_train))

# predictions for the held-out rows
y_pred = lm.predict(X_test)

# fitted coefficients (one per predictor) and intercept
coefficients = model.coef_
intercepts = model.intercept_

# observed vs. predicted values on the test split
plt.scatter(y_test, y_pred)
# Fit the same regression with statsmodels to get the full summary table.
# sm.OLS does not add an intercept by itself, so the constant column is added
# to the design matrix that is actually fitted (the training predictors).
# X itself is left untouched so later steps that reuse X do not pick up an
# extra constant column.
model2 = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(model2.summary())
# Compute cross-validation score
# 10-fold cross-validation of the linear model on the full X / y; prints the
# per-fold scores followed by their mean.
cv_results = cross_val_score(estimator=lm, X=X, y=y, cv=10)
print(cv_results)
mean_cv_score = cv_results.mean()
print('Accuracy of model: ', mean_cv_score)
# Ridge regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Ridge(normalize=True)` is no longer accepted by current scikit-learn.
# The replacement given in scikit-learn's deprecation notice is to scale the
# predictors with StandardScaler(with_mean=False) and to multiply the
# original alpha by the number of training samples.
ridge_alpha = 0.1
ridge = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=ridge_alpha * X_train.shape[0]),
)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
# Printed explicitly: a bare expression displays nothing outside a notebook.
print('Ridge model Test dataset score is: ', ridge.score(X_test, y_test))
# Lasso regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Lasso(normalize=True)` is no longer accepted by current scikit-learn.
# The replacement given in scikit-learn's deprecation notice is to scale the
# predictors with StandardScaler(with_mean=False) and to multiply the
# original alpha by sqrt(number of training samples).
lasso_alpha = 0.1
lasso = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=lasso_alpha * np.sqrt(X_train.shape[0])),
)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
# Printed explicitly: a bare expression displays nothing outside a notebook.
print('Lasso model Test dataset score is: ', lasso.score(X_test, y_test))